# Remember: library imports are ALWAYS at the top of the script, no exceptions!
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from itertools import product
from scipy.stats import skewnorm
from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
# for better resolution plots
# NOTE: the next line is IPython magic — this file only runs inside
# Jupyter/IPython, not as a plain Python script
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'
####### Disable warnings #######
import warnings
# Silence pandas/sklearn deprecation chatter only; other warnings still show
warnings.filterwarnings('ignore', category=FutureWarning)
# Setting seaborn style
sns.set()
Context¶
The data we will be using through the practical classes comes from a small relational database whose schema can be seen below:

Reading the Data¶
# Locate the SQLite database file and open a connection to it.
# The file lives one level up from the notebook, under data/.
data_dir = os.path.join("..", "data")
my_path = os.path.join(data_dir, "datamining.db")
conn = sqlite3.connect(my_path)
# SQL: denormalize the customers table by joining in the four lookup
# tables (gender, education level, marital status, recommendation text)
query = """
select
age,
income,
frq,
rcn,
mnt,
clothes,
kitchen,
small_appliances,
toys,
house_keeping,
dependents,
per_net_purchase,
g.gender,
e.education,
m.status,
r.description
from customers as c
join genders as g on g.id = c.gender_id
join education_levels as e on e.id = c.education_id
join marital_status as m on m.id = c.marital_status_id
join recommendations as r on r.id = c.recommendation_id
order by c.id;
"""
# Execute the query and materialize the result set as a DataFrame
df = pd.read_sql_query(sql=query, con=conn)
Make a copy of your original dataset¶
why?
df_original = df.copy()
Metadata¶
- id - The unique identifier of the customer
- age - The year of birth of the customer
- income - The income of the customer
- frq - Frequency: number of purchases made by the customer
- rcn - Recency: number of days since last customer purchase
- mnt - Monetary: amount of € spent by the customer in purchases
- clothes - Number of clothes items purchased by the customer
- kitchen - Number of kitchen items purchased by the customer
- small_appliances - Number of small_appliances items purchased by the customer
- toys - Number of toys items purchased by the customer
- house_keeping - Number of house_keeping items purchased by the customer
- dependents - Binary. Whether or not the customer has dependents
- per_net_purchase - Percentage of purchases made online
- education - Education level of the customer
- status - Marital status of the customer
- gender - Gender of the customer
- description - Last customer's recommendation description
Replicating modifications from previous notebook¶
# Empty strings coming out of the database are really missing values;
# convert them so pandas' NaN-aware tooling can see them
df.replace(to_replace="", value=np.nan, inplace=True)
# Per-column count of missing values
df.isnull().sum()
age 0 income 46 frq 0 rcn 0 mnt 0 clothes 0 kitchen 0 small_appliances 0 toys 0 house_keeping 0 dependents 282 per_net_purchase 0 gender 0 education 47 status 177 description 0 dtype: int64
# 'dependents' should be boolean; the nullable "boolean" extension dtype
# preserves NaNs, whereas plain "bool" would coerce them
df["dependents"] = df["dependents"].astype("boolean")
# Re-inspect the dtypes after the conversion
df.dtypes
age int64 income float64 frq int64 rcn int64 mnt int64 clothes int64 kitchen int64 small_appliances int64 toys int64 house_keeping int64 dependents boolean per_net_purchase int64 gender object education object status object description object dtype: object
# Descriptive statistics for every column (numeric and categorical),
# transposed so each variable is one row
df.describe(include="all").transpose()
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 8998.0 | NaN | NaN | NaN | 1966.05968 | 17.296552 | 1936.0 | 1951.0 | 1966.0 | 1981.0 | 1996.0 |
| income | 8952.0 | NaN | NaN | NaN | 69963.550827 | 27591.556226 | 10000.0 | 47741.0 | 70030.5 | 92218.0 | 140628.0 |
| frq | 8998.0 | NaN | NaN | NaN | 19.848077 | 10.903435 | 3.0 | 10.0 | 17.0 | 28.0 | 59.0 |
| rcn | 8998.0 | NaN | NaN | NaN | 62.469771 | 69.761802 | 0.0 | 26.0 | 53.0 | 79.0 | 549.0 |
| mnt | 8998.0 | NaN | NaN | NaN | 622.162814 | 646.768205 | 6.0 | 63.0 | 383.0 | 1076.0 | 3052.0 |
| clothes | 8998.0 | NaN | NaN | NaN | 50.446655 | 23.422249 | 1.0 | 33.0 | 51.0 | 69.0 | 99.0 |
| kitchen | 8998.0 | NaN | NaN | NaN | 7.039675 | 7.848139 | 0.0 | 2.0 | 4.0 | 10.0 | 75.0 |
| small_appliances | 8998.0 | NaN | NaN | NaN | 28.524116 | 12.586437 | 1.0 | 19.0 | 28.0 | 37.0 | 74.0 |
| toys | 8998.0 | NaN | NaN | NaN | 7.036897 | 7.924422 | 0.0 | 2.0 | 4.0 | 10.0 | 62.0 |
| house_keeping | 8998.0 | NaN | NaN | NaN | 6.929984 | 7.882655 | 0.0 | 2.0 | 4.0 | 9.0 | 77.0 |
| dependents | 8716 | 2 | True | 6164 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| per_net_purchase | 8998.0 | NaN | NaN | NaN | 42.428984 | 18.495742 | 4.0 | 28.0 | 45.0 | 57.0 | 88.0 |
| gender | 8998 | 2 | M | 5784 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| education | 8951 | 6 | Graduation | 4429 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| status | 8821 | 6 | Married | 3273 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| description | 8998 | 5 | OK nice! | 3434 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Split the columns by measurement level. The distinction matters because
# imputation, scaling and encoding are different for numeric vs categorical.
non_metric_features = ["education", "status", "gender", "dependents", "description"]
metric_features = [col for col in df.columns if col not in non_metric_features]
Data Preprocessing¶
Fill missing values¶
https://statisticsbyjim.com/basics/missing-data/
Read: Section 2.4.2
Han, J., Pei, J., & Tong, H. (2022). Data mining: Concepts and Techniques (4th ed.). Morgan Kaufmann.
Read: Chapter 4
García, S., Luengo, J., & Herrera, F. (2014). Data preprocessing in data mining. Springer.
How can we fill missing values?¶
Measures of central tendency¶
# Dedicated copy so central-tendency imputation never touches df itself
df_central = df.copy(deep=True)
# Missing-value counts (same as in df, nothing imputed yet)
df_central.isna().sum()
age 0 income 46 frq 0 rcn 0 mnt 0 clothes 0 kitchen 0 small_appliances 0 toys 0 house_keeping 0 dependents 282 per_net_purchase 0 gender 0 education 47 status 177 description 0 dtype: int64
# Column means of the numeric features — one candidate set of fill values
means = df_central.loc[:, metric_features].mean()
means
age 1966.059680 income 69963.550827 frq 19.848077 rcn 62.469771 mnt 622.162814 clothes 50.446655 kitchen 7.039675 small_appliances 28.524116 toys 7.036897 house_keeping 6.929984 per_net_purchase 42.428984 dtype: float64
# [Alternative] df_central.median(numeric_only=True) gives the same result;
# here we filter explicitly to the metric features instead of relying on
# the numeric_only flag
medians = df_central.loc[:, metric_features].median()
medians
age 1966.0 income 70030.5 frq 17.0 rcn 53.0 mnt 383.0 clothes 51.0 kitchen 4.0 small_appliances 28.0 toys 4.0 house_keeping 4.0 per_net_purchase 45.0 dtype: float64
Mean or Median?¶
#####################
## Demo only — you don't need to understand this code
#####################
# Draw a strongly right-skewed integer sample and compare mean vs median
demo_vals = np.round(skewnorm.rvs(180, size=1500, random_state=68410237)*100, 0).astype(int)
demo_median = np.median(demo_vals)
demo_mean = np.mean(demo_vals)
sns.set_theme(style="white", palette=None)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(demo_vals, bins=20, ax=ax, color='tab:orange')
# The median resists the long right tail; the mean is dragged towards it
ax.axvline(demo_median, color='black', linewidth=4,
           label='Median: {}'.format(int(demo_median)))
ax.axvline(demo_mean, color='blue', linestyle='dashed', linewidth=4,
           label='Mean: {}'.format(int(demo_mean)))
ax.legend(handlelength=5)
ax.set_title("Histogram")
plt.show()
sns.set_theme(style="white")
The mean is the average of all values. The median is the middle value when a data set is ordered from least to greatest. If a data set has an odd number of values, the median is the middle value. If a data set has an even number of values, the median is the average of the two middle values.
What about non-numeric features?¶
# mode() returns a DataFrame (ties are possible); row 0 holds the most
# frequent value of each categorical column
modes = df_central[non_metric_features].mode().iloc[0]
modes
education Graduation status Married gender M dependents True description OK nice! Name: 0, dtype: object
# Impute df_central: medians for the numeric columns (robust to skew),
# modes for the categorical ones
df_central[metric_features] = df_central[metric_features].fillna(value=medians)
df_central[non_metric_features] = df_central[non_metric_features].fillna(value=modes)
# Verify that the imputation left no missing values behind
df_central.isna().sum()
age 0 income 0 frq 0 rcn 0 mnt 0 clothes 0 kitchen 0 small_appliances 0 toys 0 house_keeping 0 dependents 0 per_net_purchase 0 gender 0 education 0 status 0 description 0 dtype: int64
KNN¶
# Separate copy for the KNN-based imputation experiment
df_neighbors = df.copy(deep=True)
# Boolean mask of rows containing at least one NaN (any column)
nans_index = df_neighbors.isna().any(axis=1)
df_neighbors[nans_index]
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | dependents | per_net_purchase | gender | education | status | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1955 | 93571.0 | 26 | 10 | 888 | 60 | 10 | 19 | 6 | 5 | True | 35 | F | Master | NaN | OK nice! |
| 61 | 1968 | NaN | 14 | 90 | 184 | 95 | 1 | 3 | 1 | 0 | True | 59 | F | 2nd Cycle | Married | Meh... |
| 67 | 1981 | 60457.0 | 9 | 73 | 63 | 37 | 2 | 51 | 6 | 3 | <NA> | 57 | M | PhD | Married | Meh... |
| 70 | 1944 | 116259.0 | 28 | 35 | 1279 | 17 | 6 | 54 | 3 | 20 | False | 11 | M | Graduation | NaN | OK nice! |
| 73 | 1967 | 75274.0 | 16 | 67 | 263 | 88 | 3 | 6 | 1 | 2 | <NA> | 66 | M | Graduation | Single | Kind of OK |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8932 | 1959 | NaN | 20 | 71 | 716 | 67 | 5 | 25 | 3 | 1 | False | 27 | M | Graduation | Married | Meh... |
| 8942 | 1950 | 92746.0 | 34 | 98 | 1399 | 50 | 5 | 36 | 5 | 4 | <NA> | 28 | M | Master | Married | Take my money!! |
| 8964 | 1978 | 44661.0 | 6 | 49 | 33 | 18 | 18 | 35 | 16 | 13 | True | 42 | M | 1st Cycle | NaN | OK nice! |
| 8972 | 1976 | NaN | 17 | 9 | 254 | 70 | 1 | 28 | 1 | 1 | True | 59 | M | Graduation | Divorced | Meh... |
| 8992 | 1954 | 87399.0 | 25 | 1 | 837 | 56 | 8 | 27 | 8 | 1 | <NA> | 47 | M | Graduation | Married | Kind of OK |
542 rows × 16 columns
# KNNImputer works on numeric data only: each NaN is replaced by the
# uniform average of its 5 nearest neighbours in metric-feature space
imputer = KNNImputer(n_neighbors=5, weights="uniform")
df_neighbors[metric_features] = imputer.fit_transform(df_neighbors[metric_features])
# Inspect only the rows whose values were just imputed
# [Alternative]
# df_neighbors[nans_index]
df_neighbors.loc[nans_index, metric_features]
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1955.0 | 93571.0 | 26.0 | 10.0 | 888.0 | 60.0 | 10.0 | 19.0 | 6.0 | 5.0 | 35.0 |
| 61 | 1968.0 | 63143.0 | 14.0 | 90.0 | 184.0 | 95.0 | 1.0 | 3.0 | 1.0 | 0.0 | 59.0 |
| 67 | 1981.0 | 60457.0 | 9.0 | 73.0 | 63.0 | 37.0 | 2.0 | 51.0 | 6.0 | 3.0 | 57.0 |
| 70 | 1944.0 | 116259.0 | 28.0 | 35.0 | 1279.0 | 17.0 | 6.0 | 54.0 | 3.0 | 20.0 | 11.0 |
| 73 | 1967.0 | 75274.0 | 16.0 | 67.0 | 263.0 | 88.0 | 3.0 | 6.0 | 1.0 | 2.0 | 66.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8932 | 1959.0 | 85643.4 | 20.0 | 71.0 | 716.0 | 67.0 | 5.0 | 25.0 | 3.0 | 1.0 | 27.0 |
| 8942 | 1950.0 | 92746.0 | 34.0 | 98.0 | 1399.0 | 50.0 | 5.0 | 36.0 | 5.0 | 4.0 | 28.0 |
| 8964 | 1978.0 | 44661.0 | 6.0 | 49.0 | 33.0 | 18.0 | 18.0 | 35.0 | 16.0 | 13.0 | 42.0 |
| 8972 | 1976.0 | 63932.4 | 17.0 | 9.0 | 254.0 | 70.0 | 1.0 | 28.0 | 1.0 | 1.0 | 59.0 |
| 8992 | 1954.0 | 87399.0 | 25.0 | 1.0 | 837.0 | 56.0 | 8.0 | 27.0 | 8.0 | 1.0 | 47.0 |
542 rows × 11 columns
Keep central tendency imputation¶
Para a 2ª observação:
KNN = 63k
Mean = 70k
Median = 70k
Why?
Porque a mediana é mais robusta a outliers, e o resultado do KNN ficou enviesado porque não foi aplicado
MinMaxScaler antes da imputação (as distâncias ficam dominadas pelas variáveis de maior escala).
# Adopt the central-tendency-imputed version as the working dataframe
df = df_central.copy(deep=True)
df_central.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8998 entries, 0 to 8997 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 8998 non-null int64 1 income 8998 non-null float64 2 frq 8998 non-null int64 3 rcn 8998 non-null int64 4 mnt 8998 non-null int64 5 clothes 8998 non-null int64 6 kitchen 8998 non-null int64 7 small_appliances 8998 non-null int64 8 toys 8998 non-null int64 9 house_keeping 8998 non-null int64 10 dependents 8998 non-null boolean 11 per_net_purchase 8998 non-null int64 12 gender 8998 non-null object 13 education 8998 non-null object 14 status 8998 non-null object 15 description 8998 non-null object dtypes: boolean(1), float64(1), int64(10), object(4) memory usage: 1.0+ MB
An overview of our previous data exploration¶
You can also explore this dataset using the exported pandas-profiling report.





Outlier removal¶
Why do we need to remove outliers? Which methods can we use?
Outlier removal using only the IQR method¶
Why should you use/not use this method?
Inter-Quartile Range formula:
$IQR = Q_3 - Q_1$
Outliers are the observations that fall
- below $Q_1 − 1.5 \times IQR$
or
- above $Q_3 + 1.5 \times IQR$
# Outlier detection via the IQR (Tukey fence) rule.
# Q1/Q3 are the 25th/75th percentiles of each metric feature.
# (Removed the redundant q1/q3 aliases — they were never used.)
q25 = df[metric_features].quantile(.25)
q75 = df[metric_features].quantile(.75)
iqr = q75 - q25
# Tukey fences: observations outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are outliers
lower_lim = q25 - 1.5 * iqr
upper_lim = q75 + 1.5 * iqr
# One boolean Series per feature; True = the row is an inlier for that feature.
# NOTE(review): inclusive='neither' also drops rows sitting exactly ON a
# fence; the textbook rule keeps them (inclusive='both') — confirm intent.
filters_iqr = [
    df[metric].between(lower_lim[metric], upper_lim[metric], inclusive='neither')
    for metric in metric_features
]
# A row is kept only if it passes the filter of EVERY metric feature
filters_iqr_all = pd.Series(np.all(filters_iqr, 0))
filters_iqr_all
0 True
1 True
2 False
3 True
4 True
...
8993 True
8994 True
8995 True
8996 True
8997 False
Length: 8998, dtype: bool
# filters_iqr
# filters_iqr_all
# Number of rows flagged as inliers by every IQR filter
filters_iqr_all.sum()
7339
# Keep only the IQR inliers and report the share of rows retained
df_iqr = df[filters_iqr_all]
kept_share = np.round(df_iqr.shape[0] / df_original.shape[0], 4)
print('Percentage of data kept after removing outliers:', 100*(kept_share))
Percentage of data kept after removing outliers: 81.56
What do you think of this number?
$20\%$ of the data is considered an outlier $\rightarrow$ Para a professora, é um valor muito alto. (No máximo $5\%$)
In general we want to keep as much of our data as possible.
As a rule of thumb, try not to remove more than 5% of your rows.
*This is only a rule of thumb!*
In some cases it really is necessary to remove more.
Manual outlier removal¶
Now let's try "manually" filtering the dataset's outliers
Let's look at the boxplots again:

# Hand-picked outlier thresholds based on the boxplots above.
# This is interpretation-driven and may vary from session to session;
# a simple template: ( df['feature1']<= limit1 ) & ( df['feature2']<= limit2 ) ...
filters_man = (
    (df['house_keeping'] <= 50)
    & (df['kitchen'] <= 40)
    & (df['toys'] <= 35)
    & (df['education'] != 'OldSchool')
)
df_man = df[filters_man]
# Share of rows that survive the manual filtering
kept_manual = np.round(df_man.shape[0] / df_original.shape[0], 4)
print('Percentage of data kept after removing outliers:', 100*kept_manual)
Percentage of data kept after removing outliers: 97.99
Combining different outlier methods¶
More robust/ consistent outlier detection method:
## Union of the two methods: keep a row if EITHER the IQR rule OR the
## manual rule considers it an inlier (more conservative removal)
df_out = df[(filters_iqr_all | filters_man)]
# BUG FIX: multiplied by 100 so the printed value is an actual percentage,
# consistent with the sibling cells above (it used to print 0.9819)
print('Percentage of data kept after removing outliers:', 100*np.round(df_out.shape[0] / df_original.shape[0], 4))
Percentage of data kept after removing outliers: 0.9819
# Proceed with the manually filtered version of the dataset
df = df_man.copy(deep=True)
Outlier threshold value¶
You may change the values of observations for a given variable:
# Remember the 'rcn' variable had an odd behavior?
# Cap (winsorize) recency at 100 days: values above 100 are threshold outliers.
# BUG FIX: removed the premature birth_year / age / spent_online assignments
# that used to live in this cell — they are (re)created in the Feature
# Engineering section below, and running both copies corrupted the columns
# ('age' ended up holding birth years and 'birth_year' holding ages, as the
# later describe() output shows).
print((df['rcn'] > 100).value_counts())
rcn_t = df['rcn'].copy()
rcn_t.loc[rcn_t > 100] = 100
df['rcn'] = rcn_t
rcn False 8437 True 380 Name: count, dtype: int64
# Side-by-side boxplots of 'rcn' before and after capping at 100
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))
sns.boxplot(data=df_original[['rcn']], ax=ax_before)
ax_before.set_title('Before')
sns.boxplot(data=df[['rcn']], ax=ax_after)
ax_after.set_title('After')
plt.show()
# Grid of before/after boxplots: one subfigure per metric feature,
# original distribution on top, outlier-removed version below
sns.set_style('whitegrid')
frows = 2
fcols = ceil(len(metric_features)/frows)  # columns needed to fit all features
fig = plt.figure(figsize=(20,8))
subfigs = fig.subfigures(frows, fcols, wspace=.03, hspace=.03)
for sfig, feat in zip(subfigs.flatten(), metric_features):
    axes = sfig.subplots(2, 1, sharex=True)  # shared x-axis eases comparison
    sns.boxplot(x=df_original[feat], ax=axes[0])
    axes[0].set_ylabel('Original')
    axes[0].set_title(feat, fontsize='large')
    sns.boxplot(x=df[feat], ax=axes[1])
    axes[1].set_ylabel('Outliers\nRemoved')
    axes[1].set_xlabel('')
    # Subfigure Aesthetics
    sfig.set_facecolor("#F9F9F9")
    sfig.subplots_adjust(left=.2, right=.95, bottom=.1,)
# NOTE(review): assumes ../figures/eda already exists — savefig does not create dirs
plt.savefig(os.path.join('..', 'figures', 'eda', 'boxplots_before_after_outliers.png'), dpi=200)
plt.show()
sns.set()
Feature Engineering¶
A reminder of our metadata:
- id - The unique identifier of the customer
- age - The year of birth of the customer
- income - The income of the customer
- frq - Frequency: number of purchases made by the customer
- rcn - Recency: number of days since last customer purchase
- mnt - Monetary: amount of € spent by the customer in purchases
- clothes - Number of clothes items purchased by the customer
- kitchen - Number of kitchen items purchased by the customer
- small_appliances - Number of small_appliances items purchased by the customer
- toys - Number of toys items purchased by the customer
- house_keeping - Number of house_keeping items purchased by the customer
- dependents - Binary. Whether or not the customer has dependents
- per_net_purchase - Percentage of purchases made online
- education - Education level of the customer
- status - Marital status of the customer
- gender - Gender of the customer
- description - Last customer's recommendation description
# Try to pull as many features as you can; You can always drop them later on
# You can start by converting the 'age' variable to the actual age
# You can also store the birth year in another feature
# NOTE(review): the 'rcn' cell above already ran these same assignments once;
# executing both copies means 'age' no longer holds a birth year by the time
# this cell runs, so 'birth_year'/'age' end up swapped (see the describe()
# output below) — confirm and keep only one copy of this code.
new_features = []
df['birth_year'] = df['age']
df['age'] = 2024 - df['birth_year']
new_features.append('birth_year')
new_features
['birth_year']
# Amount (in €) spent through the online channel:
# per_net_purchase is a percentage, so divide by 100 before multiplying by mnt
df['spent_online'] = df['per_net_purchase']*df['mnt']/100
new_features.append('spent_online')
new_features
['birth_year', 'spent_online']
df.describe()
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | birth_year | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 | 8817.000000 |
| mean | 1965.664739 | 70534.387887 | 19.990360 | 52.220937 | 630.162640 | 51.181127 | 6.776908 | 28.673925 | 6.581263 | 6.764319 | 42.241239 | 58.335261 | 178.378072 |
| std | 17.075958 | 27246.776600 | 10.918369 | 30.098895 | 647.834648 | 22.984454 | 7.310850 | 12.595621 | 6.895266 | 7.540019 | 18.482272 | 17.075958 | 168.577960 |
| min | 1936.000000 | 10000.000000 | 3.000000 | 0.000000 | 6.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 4.000000 | 28.000000 | 2.730000 |
| 25% | 1951.000000 | 48842.000000 | 10.000000 | 26.000000 | 64.000000 | 34.000000 | 2.000000 | 19.000000 | 2.000000 | 2.000000 | 27.000000 | 44.000000 | 34.650000 |
| 50% | 1966.000000 | 70388.000000 | 18.000000 | 53.000000 | 387.000000 | 52.000000 | 4.000000 | 28.000000 | 4.000000 | 4.000000 | 45.000000 | 58.000000 | 140.000000 |
| 75% | 1980.000000 | 92413.000000 | 28.000000 | 78.000000 | 1079.000000 | 70.000000 | 9.000000 | 37.000000 | 9.000000 | 9.000000 | 57.000000 | 73.000000 | 265.100000 |
| max | 1996.000000 | 140628.000000 | 59.000000 | 100.000000 | 3052.000000 | 99.000000 | 40.000000 | 74.000000 | 35.000000 | 50.000000 | 88.000000 | 88.000000 | 1416.240000 |
Variable selection: Redundancy VS Relevancy¶
Depois de criarmos as novas variáveis, vamos verificar a correlação entre elas e verficar se temos variáveis redundantes.
`income` e `age` | `mnt` e `frq` são variáveis redundantes - Porque é que as removemos? Porque vamos dar mais peso à mesma informação
- Qual vamos remover? EDA é importante para decidir
- (p.e., quando analisamos os boxplots, podemos ver que a mnt e frq e ver se têm outliers ->
frqmelhor/ quando analisamos os histogramas, podemos procurar por distribuições normais -> Neste casofrqé ligeiramente mais normal quemnt) - (p.e. quando vemos a
ageeincomepodemos ver que aincometem uma distribuição mais normal que aage, masincometem missing values e ambos não têm outliers -> Neste caso tmb podemos pensar qual será a variável mais fácil de obter corretamente dos dados para o negócio -> Ela optou por remover aage) - Também podemos pensar qual será a mais fácil de explicar no cluster ->
ageé mais fácil de explicar que aincome-> Umas são mais fáceis de ser para serem INPUT do clustering e outras para PROFILE
- (p.e., quando analisamos os boxplots, podemos ver que a mnt e frq e ver se têm outliers ->
Quando fazemos a Feature Engineering temos de fazer a matriz de correlação again para ver se temos novas variáveis redundantes
# Drop one variable from each redundant (highly correlated) pair and
# register the engineered feature as a clustering input.
## RUN THIS CELL ONLY ONCE!!!! (it mutates metric_features in place)
for redundant in ("mnt", "age"):
    metric_features.remove(redundant)
metric_features.append("spent_online")
metric_features
['income', 'frq', 'rcn', 'clothes', 'kitchen', 'small_appliances', 'toys', 'house_keeping', 'per_net_purchase', 'spent_online']
# You can also drop the features from the df
# But sometimes they may be useful for cluster profiling later
# df.drop(..., inplace=True)
Relevancy¶
Selecting variables based on the relevancy of each one to the task.
Example:
- remove uncorrelated variables with the target,
- stepwise regression,
- use variables for product clustering,
- use variables for socio-demographic clustering,
- ...
Variables that aren't correlated with any other variable are often also not relevant. In this case we will not focus on this a lot since we don't have a defined task yet.
Redo data exploration¶
Check if the data looks the way you expect it to.
- Have you missed some outliers?
- Are there still missing values?
- Is the data normalized?
This is an iterative process. It is likely you will change your preprocessing steps frequently throughout your group work.
Data Normalization¶
# Remember our original data
# Note the wildly different ranges across columns — this is why we scale
df.describe().round(2)
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | birth_year | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 |
| mean | 1965.66 | 70534.39 | 19.99 | 52.22 | 630.16 | 51.18 | 6.78 | 28.67 | 6.58 | 6.76 | 42.24 | 58.34 | 178.38 |
| std | 17.08 | 27246.78 | 10.92 | 30.10 | 647.83 | 22.98 | 7.31 | 12.60 | 6.90 | 7.54 | 18.48 | 17.08 | 168.58 |
| min | 1936.00 | 10000.00 | 3.00 | 0.00 | 6.00 | 1.00 | 0.00 | 1.00 | 0.00 | 0.00 | 4.00 | 28.00 | 2.73 |
| 25% | 1951.00 | 48842.00 | 10.00 | 26.00 | 64.00 | 34.00 | 2.00 | 19.00 | 2.00 | 2.00 | 27.00 | 44.00 | 34.65 |
| 50% | 1966.00 | 70388.00 | 18.00 | 53.00 | 387.00 | 52.00 | 4.00 | 28.00 | 4.00 | 4.00 | 45.00 | 58.00 | 140.00 |
| 75% | 1980.00 | 92413.00 | 28.00 | 78.00 | 1079.00 | 70.00 | 9.00 | 37.00 | 9.00 | 9.00 | 57.00 | 73.00 | 265.10 |
| max | 1996.00 | 140628.00 | 59.00 | 100.00 | 3052.00 | 99.00 | 40.00 | 74.00 | 35.00 | 50.00 | 88.00 | 88.00 | 1416.24 |
MinMax Scaling¶
Transforms values to be between [0,1]
$$ x' = \frac{x - min(X)}{max(X) - min(X)}$$where:
- $x$ is an original value
- $x'$ is the normalized value
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
## A tiny age sample to demonstrate scaling by hand
ages_sample = [14, 24, 22, 18, 18, 26, 11, 23, 13, 12]
demo_ages = pd.DataFrame({"Age": ages_sample})
demo_ages
| Age | |
|---|---|
| 0 | 14 |
| 1 | 24 |
| 2 | 22 |
| 3 | 18 |
| 4 | 18 |
| 5 | 26 |
| 6 | 11 |
| 7 | 23 |
| 8 | 13 |
| 9 | 12 |
# How to do this with Python/Pandas?
# ( demo_ages - min(demo_ages) )
# ---------------------------------
# max(demo_ages) - min(demo_ages)
# (display the raw values again before scaling)
demo_ages
| Age | |
|---|---|
| 0 | 14 |
| 1 | 24 |
| 2 | 22 |
| 3 | 18 |
| 4 | 18 |
| 5 | 26 |
| 6 | 11 |
| 7 | 23 |
| 8 | 13 |
| 9 | 12 |
# Min-max by hand: x' = (x - min) / (max - min), which maps onto [0, 1]
min_age = demo_ages["Age"].min()
max_age = demo_ages["Age"].max()
age_range = max_age - min_age
mm_age = (demo_ages["Age"] - min_age) / age_range
pd.DataFrame(mm_age)
| Age | |
|---|---|
| 0 | 0.200000 |
| 1 | 0.866667 |
| 2 | 0.733333 |
| 3 | 0.466667 |
| 4 | 0.466667 |
| 5 | 1.000000 |
| 6 | 0.000000 |
| 7 | 0.800000 |
| 8 | 0.133333 |
| 9 | 0.066667 |
Remember the formula:
$$x' = \frac{x - min(X)}{max(X) - min(X)}$$where:
- $x$ is an original value
- $x'$ is the normalized value
(This is not showing all rows)
| Age (Orig) | Formula | Result |
|---|---|---|
| 14 | (14 - 11) / 15 | 0.2 |
| 24 | (24 - 11) / 15 | 0.86666667 |
| 26 | (26 - 11) / 15 | 1 |
| 11 | (11 - 11) / 15 | 0 |
# sklearn equivalent of the manual computation above
demo_scaler = MinMaxScaler()
demo_ages["Age_minmax"] = demo_scaler.fit_transform(demo_ages[["Age"]])
demo_ages
| Age | Age_minmax | |
|---|---|---|
| 0 | 14 | 0.200000 |
| 1 | 24 | 0.866667 |
| 2 | 22 | 0.733333 |
| 3 | 18 | 0.466667 |
| 4 | 18 | 0.466667 |
| 5 | 26 | 1.000000 |
| 6 | 11 | 0.000000 |
| 7 | 23 | 0.800000 |
| 8 | 13 | 0.133333 |
| 9 | 12 | 0.066667 |
Now do it for our dataset¶
# Keep an untouched copy; the scaled values live in df_minmax only
df_minmax = df.copy(deep=True)
# fit_transform learns each column's min/max, then rescales to [0, 1]
mm_scaler = MinMaxScaler()
mm_scaled_feat = mm_scaler.fit_transform(df.loc[:, metric_features])
mm_scaled_feat
array([[0.61841259, 0.53571429, 0.66 , ..., 0.06 , 0.17857143,
0.1865215 ],
[0.78867471, 0.51785714, 0.06 , ..., 0.04 , 0.05952381,
0.0959314 ],
[0.14042931, 0.14285714, 0.69 , ..., 0.48 , 0.6547619 ,
0.01643427],
...,
[0.36838197, 0.16071429, 0.06 , ..., 0.12 , 0.79761905,
0.02870868],
[0.33906972, 0.46428571, 0.72 , ..., 0.22 , 0.32142857,
0.21979328],
[0.88772698, 0.625 , 0.75 , ..., 0.12 , 0.48809524,
0.52908717]])
# The parameters learned by fit live in trailing-underscore attributes
print("Parameters fitted:\n")
fitted_bounds = [mm_scaler.data_min_, mm_scaler.data_max_]
pd.DataFrame(
    fitted_bounds,
    columns=metric_features,
    index=['min', 'max'])
Parameters fitted:
| income | frq | rcn | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|
| min | 10000.0 | 3.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 4.0 | 2.73 |
| max | 140628.0 | 59.0 | 100.0 | 99.0 | 40.0 | 74.0 | 35.0 | 50.0 | 88.0 | 1416.24 |
## Replace original metric_features values with mm_scaled_feat values
df_minmax[metric_features] = mm_scaled_feat
# Checking max and min of minmaxed variables (each column should span [0, 1])
df_minmax[metric_features].describe().round(2)
| income | frq | rcn | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 |
| mean | 0.46 | 0.30 | 0.52 | 0.51 | 0.17 | 0.38 | 0.19 | 0.14 | 0.46 | 0.12 |
| std | 0.21 | 0.19 | 0.30 | 0.23 | 0.18 | 0.17 | 0.20 | 0.15 | 0.22 | 0.12 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 0.30 | 0.12 | 0.26 | 0.34 | 0.05 | 0.25 | 0.06 | 0.04 | 0.27 | 0.02 |
| 50% | 0.46 | 0.27 | 0.53 | 0.52 | 0.10 | 0.37 | 0.11 | 0.08 | 0.49 | 0.10 |
| 75% | 0.63 | 0.45 | 0.78 | 0.70 | 0.22 | 0.49 | 0.26 | 0.18 | 0.63 | 0.19 |
| max | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
df_minmax[metric_features].head()
| income | frq | rcn | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.618413 | 0.535714 | 0.66 | 0.367347 | 0.125 | 0.589041 | 0.285714 | 0.06 | 0.178571 | 0.186521 |
| 1 | 0.788675 | 0.517857 | 0.06 | 0.551020 | 0.025 | 0.506849 | 0.114286 | 0.04 | 0.059524 | 0.095931 |
| 2 | 0.140429 | 0.142857 | 0.69 | 0.316327 | 0.475 | 0.315068 | 0.028571 | 0.48 | 0.654762 | 0.016434 |
| 3 | 0.639763 | 0.410714 | 0.10 | 0.602041 | 0.250 | 0.246575 | 0.171429 | 0.10 | 0.369048 | 0.217947 |
| 4 | 0.626604 | 0.500000 | 0.26 | 0.591837 | 0.125 | 0.369863 | 0.114286 | 0.08 | 0.357143 | 0.271799 |
df[metric_features].head()
| income | frq | rcn | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90782.0 | 33 | 66 | 37 | 5 | 44 | 10 | 3 | 19 | 266.38 |
| 1 | 113023.0 | 32 | 6 | 55 | 1 | 38 | 4 | 2 | 9 | 138.33 |
| 2 | 28344.0 | 11 | 69 | 32 | 19 | 24 | 1 | 24 | 59 | 25.96 |
| 3 | 93571.0 | 26 | 10 | 60 | 10 | 19 | 6 | 5 | 35 | 310.80 |
| 4 | 91852.0 | 31 | 26 | 59 | 5 | 28 | 4 | 4 | 34 | 386.92 |
Standard Scaling (AKA Z-Score Scaling)¶
Standardize features by removing the mean and scaling to unit variance.
The standard score of a sample x is calculated as:
$$z = (x - u) / s$$where:
$u$ is the mean of the training samples,
$s$ is the standard deviation of the training samples
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
## Let's see the age demo again (raw values plus the min-max column)
demo_ages
| Age | Age_minmax | |
|---|---|---|
| 0 | 14 | 0.200000 |
| 1 | 24 | 0.866667 |
| 2 | 22 | 0.733333 |
| 3 | 18 | 0.466667 |
| 4 | 18 | 0.466667 |
| 5 | 26 | 1.000000 |
| 6 | 11 | 0.000000 |
| 7 | 23 | 0.800000 |
| 8 | 13 | 0.133333 |
| 9 | 12 | 0.066667 |
where:
- $u$ is the mean of the training samples,
- $s$ is the standard deviation of the training samples
# Standardization by hand: z = (x - u) / s
#   u = mean of age, s = std of age
# ddof=0 gives the population std, which is what sklearn's StandardScaler
# uses internally (pandas defaults to ddof=1, the sample std):
#   ddof : int, default 1 — the divisor is ``N - ddof``
mean_age = demo_ages["Age"].mean()
std_age = demo_ages["Age"].std(ddof=0)
z_scores = (demo_ages["Age"] - mean_age) / std_age
pd.DataFrame(z_scores)
| Age | |
|---|---|
| 0 | -0.793614 |
| 1 | 1.142031 |
| 2 | 0.754902 |
| 3 | -0.019356 |
| 4 | -0.019356 |
| 5 | 1.529160 |
| 6 | -1.374308 |
| 7 | 0.948466 |
| 8 | -0.987179 |
| 9 | -1.180743 |
# Same result via sklearn's StandardScaler
z_scaler = StandardScaler()
demo_ages["Age_standard"] = z_scaler.fit_transform(demo_ages[["Age"]])
demo_ages
| Age | Age_minmax | Age_standard | |
|---|---|---|---|
| 0 | 14 | 0.200000 | -0.793614 |
| 1 | 24 | 0.866667 | 1.142031 |
| 2 | 22 | 0.733333 | 0.754902 |
| 3 | 18 | 0.466667 | -0.019356 |
| 4 | 18 | 0.466667 | -0.019356 |
| 5 | 26 | 1.000000 | 1.529160 |
| 6 | 11 | 0.000000 | -1.374308 |
| 7 | 23 | 0.800000 | 0.948466 |
| 8 | 13 | 0.133333 | -0.987179 |
| 9 | 12 | 0.066667 | -1.180743 |
demo_ages.describe().round(2)
| Age | Age_minmax | Age_standard | |
|---|---|---|---|
| count | 10.00 | 10.00 | 10.00 |
| mean | 18.10 | 0.47 | -0.00 |
| std | 5.45 | 0.36 | 1.05 |
| min | 11.00 | 0.00 | -1.37 |
| 25% | 13.25 | 0.15 | -0.94 |
| 50% | 18.00 | 0.47 | -0.02 |
| 75% | 22.75 | 0.78 | 0.90 |
| max | 26.00 | 1.00 | 1.53 |
Now do it for our dataset¶
# Untouched copy; the standardized values live in df_standard only
df_standard = df.copy(deep=True)
# z-score every metric feature: (x - mean) / std
ss_scaler = StandardScaler()
ss_scaled_feat = ss_scaler.fit_transform(df.loc[:, metric_features])
ss_scaled_feat
array([[ 0.74316169, 1.1916046 , 0.45781896, ..., -0.49927359,
-1.25755953, 0.52205471],
[ 1.55948802, 1.10001063, -1.53572277, ..., -0.63190677,
-1.79864925, -0.23757758],
[-1.5485421 , -0.82346271, 0.55749604, ..., 2.28602317,
0.90679936, -0.90419125],
...,
[-0.4556169 , -0.73186874, -1.53572277, ..., -0.10137405,
1.55610702, -0.80126567],
[-0.59615525, 0.82522873, 0.65717313, ..., 0.56179185,
-0.60825186, 0.80105125],
[ 2.03439682, 1.64957444, 0.75685022, ..., -0.10137405,
0.14927375, 3.39459784]])
# Fitted parameters (trailing underscore = learned during fit);
# sklearn stores the variance, so take sqrt to display the std
print("Parameters fitted:\n")
pd.DataFrame(
    [ss_scaler.mean_, np.sqrt(ss_scaler.var_)],
    columns=metric_features,
    index=['mean', 'std'])
Parameters fitted:
| income | frq | rcn | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|
| mean | 70534.387887 | 19.990360 | 52.220937 | 51.181127 | 6.776908 | 28.673925 | 6.581263 | 6.764319 | 42.241239 | 178.378072 |
| std | 27245.231428 | 10.917749 | 30.097188 | 22.983151 | 7.310435 | 12.594907 | 6.894875 | 7.539592 | 18.481223 | 168.568400 |
# Overwrite the metric columns of the copy with their standardized values;
# non-metric columns (age, mnt, categoricals, ...) are left untouched.
df_standard[metric_features] = ss_scaled_feat
df_standard.head()
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | dependents | per_net_purchase | gender | education | status | description | birth_year | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1946 | 0.743162 | 1.191605 | 0.457819 | 1402 | -0.617023 | -0.243065 | 1.216847 | 0.495837 | -0.499274 | False | -1.257560 | M | Graduation | Together | Take my money!! | 78 | 0.522055 |
| 1 | 1936 | 1.559488 | 1.100011 | -1.535723 | 1537 | 0.166160 | -0.790228 | 0.740464 | -0.374374 | -0.631907 | False | -1.798649 | F | PhD | Divorced | Take my money!! | 88 | -0.237578 |
| 2 | 1990 | -1.548542 | -0.823463 | 0.557496 | 44 | -0.834573 | 1.672006 | -0.371096 | -0.809480 | 2.286023 | True | 0.906799 | M | Graduation | Married | Kind of OK | 34 | -0.904191 |
| 3 | 1955 | 0.845528 | 0.550447 | -1.402820 | 888 | 0.383710 | 0.440889 | -0.768082 | -0.084304 | -0.234007 | True | -0.391816 | F | Master | Married | OK nice! | 69 | 0.785568 |
| 4 | 1955 | 0.782435 | 1.008417 | -0.871209 | 1138 | 0.340200 | -0.243065 | -0.053508 | -0.374374 | -0.366640 | True | -0.445925 | F | Graduation | Together | Take my money!! | 69 | 1.237135 |
# Checking mean and variance of standardized variables
# (each standardized column should now have mean ~0 and std ~1)
df_standard[metric_features].describe().round(2)
| income | frq | rcn | clothes | kitchen | small_appliances | toys | house_keeping | per_net_purchase | spent_online | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 | 8817.00 |
| mean | -0.00 | -0.00 | 0.00 | -0.00 | -0.00 | -0.00 | 0.00 | 0.00 | -0.00 | -0.00 |
| std | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| min | -2.22 | -1.56 | -1.74 | -2.18 | -0.93 | -2.20 | -0.95 | -0.90 | -2.07 | -1.04 |
| 25% | -0.80 | -0.92 | -0.87 | -0.75 | -0.65 | -0.77 | -0.66 | -0.63 | -0.82 | -0.85 |
| 50% | -0.01 | -0.18 | 0.03 | 0.04 | -0.38 | -0.05 | -0.37 | -0.37 | 0.15 | -0.23 |
| 75% | 0.80 | 0.73 | 0.86 | 0.82 | 0.30 | 0.66 | 0.35 | 0.30 | 0.80 | 0.51 |
| max | 2.57 | 3.57 | 1.59 | 2.08 | 4.54 | 3.60 | 4.12 | 5.73 | 2.48 | 7.34 |
Important: What if we had a training and test set? Should we fit a Scaler in both? What about other Sklearn objects?
Yes — we should **fit** the scaler on the **training set** only, and then **transform** both the training and the test set. The same rule applies to other fitted Sklearn objects (encoders, imputers, PCA, ...): fit on train, transform both.
Compare MinMaxScaler vs StandardScaler vs Original data¶
Boxplots¶
# 2x3 grid of boxplots: rows are two features, columns are the three
# versions of the data (original, min-max scaled, standardized).
sns.set_style('whitegrid')
fig, axes = plt.subplots(2, 3, figsize=(8, 5), tight_layout=True, sharex='col', sharey='row')

bp_feat_l = 'house_keeping'
bp_feat_r = 'income'

scaled_frames = [df_original, df_minmax, df_standard]
col_titles = ['Original', 'MinMaxScaler()', 'StandardScaler()']

for col, (frame, title) in enumerate(zip(scaled_frames, col_titles)):
    sns.boxplot(frame, x=bp_feat_l, ax=axes[0][col], width=.4)
    axes[0][col].set_title(title)
    sns.boxplot(frame, x=bp_feat_r, ax=axes[1][col], width=.4)
    axes[1][col].set_xlabel(None)  # bottom row: suptitle already names the features

axes[0][0].set_ylabel(bp_feat_l)
axes[1][0].set_ylabel(bp_feat_r)
fig.suptitle('Boxplots: "{}" and "{}"'.format(bp_feat_l, bp_feat_r))
plt.show()
Neste caso devíamos usar o `StandardScaler` porque, apesar de termos outliers, o `StandardScaler` mantém simétrica a distribuição dos dados.

Os outliers são mantidos, mas, dependendo do scaler, a distribuição resultante é diferente.
# Stacked boxplots of the same feature under each scaling, for a direct
# visual comparison of the resulting ranges.
sns.set_style('white')
fig, axes = plt.subplots(3, 1, figsize=(8, 5), tight_layout=True)
panels = [(df_standard, "standard"), (df_minmax, "mm"), (df, "origi")]
for ax, (frame, label) in zip(axes, panels):
    sns.boxplot(frame, x=bp_feat_l, ax=ax, width=.4)
    ax.set_title(label)
plt.show()
Histograms¶
What about histograms?
Exercise: Visualize histograms here, compare the shapes of the original distribution with the MinMaxScaler and StandardScaler results.
# Histograms of one feature before and after each scaler: the shape of the
# distribution is preserved, only the x-axis scale changes.
sns.set_style('white')
fig, axes = plt.subplots(3, 1, figsize=(5, 5), tight_layout=True)

hp_feat = 'house_keeping'            # feature to visualize
hp_args = dict(x=hp_feat, bins=15)   # shared histplot arguments

hist_panels = [
    (df_original, 'Original'),
    (df_minmax, 'MinMaxScaler()'),
    (df_standard, 'StandardScaler()'),
]
for ax, (frame, label) in zip(axes, hist_panels):
    sns.histplot(frame, ax=ax, **hp_args)
    ax.set_title('{}: {}'.format(hp_feat, label))
    ax.set_xlabel(None)
plt.show()

# Restore the default seaborn theme for the following plots.
sns.set()
O scaling não torna magicamente a distribuição normal — apenas muda a escala dos dados.
# sns.set()
Keep StandardScaler¶
# From here on, work with the standardized version of the data.
df = df_standard.copy()
One-Hot Encoding¶
df_ohc = df.copy()

# First let's remove status=Whatever: replace the placeholder category
# with the mode of 'status' so it does not get its own dummy column.
df_ohc.loc[df_ohc['status'] == 'Whatever', 'status'] = df['status'].mode()[0]

# One-hot encode the categorical features. drop="first" drops one dummy per
# feature to avoid perfectly redundant (collinear) columns.
# NOTE: the original cell called fit_transform/get_feature_names_out twice
# with identical arguments; the duplicate (redundant) calls were removed.
ohc = OneHotEncoder(sparse_output=False, drop="first")
ohc_feat = ohc.fit_transform(df_ohc[non_metric_features])
ohc_feat_names = ohc.get_feature_names_out()

# Wrap the encoded array in a DataFrame; index=df_ohc.index keeps the rows
# aligned with df_ohc for the column-wise concat that follows.
ohc_df = pd.DataFrame(ohc_feat, index=df_ohc.index, columns=ohc_feat_names)
ohc_df
| education_2nd Cycle | education_Graduation | education_Master | education_PhD | status_Married | status_Single | status_Together | status_Widow | gender_M | dependents_1.0 | description_Kind of OK | description_Meh... | description_OK nice! | description_Take my money!! | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8992 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 8993 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8994 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 8995 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8996 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
8817 rows × 14 columns
# Reassigning df to contain ohc variables
# Column-wise concat works here because ohc_df was built with df_ohc's index.
df_ohc = pd.concat([df_ohc, ohc_df], axis=1)
df_ohc.head()
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | ... | status_Married | status_Single | status_Together | status_Widow | gender_M | dependents_1.0 | description_Kind of OK | description_Meh... | description_OK nice! | description_Take my money!! | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1946 | 0.743162 | 1.191605 | 0.457819 | 1402 | -0.617023 | -0.243065 | 1.216847 | 0.495837 | -0.499274 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 1936 | 1.559488 | 1.100011 | -1.535723 | 1537 | 0.166160 | -0.790228 | 0.740464 | -0.374374 | -0.631907 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 1990 | -1.548542 | -0.823463 | 0.557496 | 44 | -0.834573 | 1.672006 | -0.371096 | -0.809480 | 2.286023 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 3 | 1955 | 0.845528 | 0.550447 | -1.402820 | 888 | 0.383710 | 0.440889 | -0.768082 | -0.084304 | -0.234007 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1955 | 0.782435 | 1.008417 | -0.871209 | 1138 | 0.340200 | -0.243065 | -0.053508 | -0.374374 | -0.366640 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 32 columns
# Adopt the one-hot encoded frame as the working dataset and list its columns.
df = df_ohc.copy()
list(df.columns)
['age', 'income', 'frq', 'rcn', 'mnt', 'clothes', 'kitchen', 'small_appliances', 'toys', 'house_keeping', 'dependents', 'per_net_purchase', 'gender', 'education', 'status', 'description', 'birth_year', 'spent_online', 'education_2nd Cycle', 'education_Graduation', 'education_Master', 'education_PhD', 'status_Married', 'status_Single', 'status_Together', 'status_Widow', 'gender_M', 'dependents_1.0', 'description_Kind of OK', 'description_Meh...', 'description_OK nice!', 'description_Take my money!!']
df_ohc[non_metric_features]
| education | status | gender | dependents | description | |
|---|---|---|---|---|---|
| 0 | Graduation | Together | M | False | Take my money!! |
| 1 | PhD | Divorced | F | False | Take my money!! |
| 2 | Graduation | Married | M | True | Kind of OK |
| 3 | Master | Married | F | True | OK nice! |
| 4 | Graduation | Together | F | True | Take my money!! |
| ... | ... | ... | ... | ... | ... |
| 8992 | Graduation | Married | M | True | Kind of OK |
| 8993 | 1st Cycle | Single | F | True | Take my money!! |
| 8994 | 2nd Cycle | Single | M | True | Meh... |
| 8995 | Graduation | Together | M | False | Take my money!! |
| 8996 | 2nd Cycle | Married | M | True | Take my money!! |
8817 rows × 5 columns
df_ohc["education"].unique() # 5 categories -> 4 dummies (one dropped by drop="first")
array(['Graduation', 'PhD', 'Master', '2nd Cycle', '1st Cycle'],
dtype=object)
ohc_df
| education_2nd Cycle | education_Graduation | education_Master | education_PhD | status_Married | status_Single | status_Together | status_Widow | gender_M | dependents_1.0 | description_Kind of OK | description_Meh... | description_OK nice! | description_Take my money!! | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8992 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 8993 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8994 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 8995 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8996 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
8817 rows × 14 columns
Nem sempre o `One-Hot Encoding` é a melhor opção — depende do conhecimento que temos dos dados. $\rightarrow$ `Label Encoding` seria provavelmente a melhor opção aqui.
# You can also choose to drop the original non_metric_features from your df
# df.drop(columns=non_metric_features)
Rename OHE columns¶
## Rename OHE columns from "feature_val_a" to "oh_feature_val_a"
## e.g. status_Widow to oh_status_Widow
## We do this to be able to distinguish the OHE columns more easily later
## Assemble OHE columns and their new column names
# Map every OHE column "feature_value" to "oh_feature_value" so the dummy
# columns are easy to tell apart from the original categorical columns.
# The `col not in non_metric_features` guard keeps the original categorical
# columns themselves (e.g. "status") out of the rename.
rename_ohe_cols = {
    col: 'oh_' + col
    for feat in non_metric_features
    for col in df.columns[df.columns.str.startswith(feat)]
    if col not in non_metric_features
}
df.rename(columns=rename_ohe_cols, inplace=True)
df
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | ... | oh_status_Married | oh_status_Single | oh_status_Together | oh_status_Widow | oh_gender_M | oh_dependents_1.0 | oh_description_Kind of OK | oh_description_Meh... | oh_description_OK nice! | oh_description_Take my money!! | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1946 | 0.743162 | 1.191605 | 0.457819 | 1402 | -0.617023 | -0.243065 | 1.216847 | 0.495837 | -0.499274 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 1936 | 1.559488 | 1.100011 | -1.535723 | 1537 | 0.166160 | -0.790228 | 0.740464 | -0.374374 | -0.631907 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 1990 | -1.548542 | -0.823463 | 0.557496 | 44 | -0.834573 | 1.672006 | -0.371096 | -0.809480 | 2.286023 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 3 | 1955 | 0.845528 | 0.550447 | -1.402820 | 888 | 0.383710 | 0.440889 | -0.768082 | -0.084304 | -0.234007 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1955 | 0.782435 | 1.008417 | -0.871209 | 1138 | 0.340200 | -0.243065 | -0.053508 | -0.374374 | -0.366640 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8992 | 1954 | 0.618993 | 0.458853 | -1.701851 | 837 | 0.209670 | 0.167308 | -0.132905 | 0.205767 | -0.764540 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 8993 | 1960 | 0.874744 | 0.733635 | -1.701851 | 896 | 0.731791 | -0.243065 | -0.609288 | -0.519409 | -0.366640 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8994 | 1975 | -0.455617 | -0.731869 | -1.535723 | 61 | 0.079139 | -0.106274 | -0.053508 | 0.060732 | -0.101374 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 8995 | 1986 | -0.596155 | 0.825229 | 0.657173 | 1011 | -0.442982 | 0.577680 | 0.581670 | -0.809480 | 0.561792 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8996 | 1938 | 2.034397 | 1.649574 | 0.756850 | 1668 | 0.427220 | -0.653437 | -0.291699 | -0.229339 | -0.101374 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
8817 rows × 32 columns
list(df.columns)
['age', 'income', 'frq', 'rcn', 'mnt', 'clothes', 'kitchen', 'small_appliances', 'toys', 'house_keeping', 'dependents', 'per_net_purchase', 'gender', 'education', 'status', 'description', 'birth_year', 'spent_online', 'oh_education_2nd Cycle', 'oh_education_Graduation', 'oh_education_Master', 'oh_education_PhD', 'oh_status_Married', 'oh_status_Single', 'oh_status_Together', 'oh_status_Widow', 'oh_gender_M', 'oh_dependents_1.0', 'oh_description_Kind of OK', 'oh_description_Meh...', 'oh_description_OK nice!', 'oh_description_Take my money!!']
Redo data exploration¶
Check if the data looks the way you expect it to.
- Have you missed some outliers?
- Are there still missing values?
- Is the data normalized?
This is an iterative process. It is likely you will change your preprocessing steps frequently throughout your group work.
metric_features
['income', 'frq', 'rcn', 'clothes', 'kitchen', 'small_appliances', 'toys', 'house_keeping', 'per_net_purchase', 'spent_online']
non_metric_features
['education', 'status', 'gender', 'dependents', 'description']
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 8817 entries, 0 to 8996 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 8817 non-null int64 1 income 8817 non-null float64 2 frq 8817 non-null float64 3 rcn 8817 non-null float64 4 mnt 8817 non-null int64 5 clothes 8817 non-null float64 6 kitchen 8817 non-null float64 7 small_appliances 8817 non-null float64 8 toys 8817 non-null float64 9 house_keeping 8817 non-null float64 10 dependents 8817 non-null boolean 11 per_net_purchase 8817 non-null float64 12 gender 8817 non-null object 13 education 8817 non-null object 14 status 8817 non-null object 15 description 8817 non-null object 16 birth_year 8817 non-null int64 17 spent_online 8817 non-null float64 18 oh_education_2nd Cycle 8817 non-null float64 19 oh_education_Graduation 8817 non-null float64 20 oh_education_Master 8817 non-null float64 21 oh_education_PhD 8817 non-null float64 22 oh_status_Married 8817 non-null float64 23 oh_status_Single 8817 non-null float64 24 oh_status_Together 8817 non-null float64 25 oh_status_Widow 8817 non-null float64 26 oh_gender_M 8817 non-null float64 27 oh_dependents_1.0 8817 non-null float64 28 oh_description_Kind of OK 8817 non-null float64 29 oh_description_Meh... 8817 non-null float64 30 oh_description_OK nice! 8817 non-null float64 31 oh_description_Take my money!! 8817 non-null float64 dtypes: boolean(1), float64(24), int64(3), object(4) memory usage: 2.2+ MB
# Save the cleaned data - Part 2
# index=False: the row index is purely positional, no need to persist it.
df.to_csv(os.path.join("..", "data", "cleaned_data_part2.csv"), index=False)
A tool to assist you through your exploratory data analysis¶
Optionally, you may use pandas-profiling in your data analysis.
Remember, although this tool provides excelent insights about the data you're working with, it is not enough to perform a proper analysis.
from ydata_profiling import ProfileReport

# Only the Pearson correlation is computed; the remaining correlation
# methods are switched off.
report_correlations = {
    method: {"calculate": method == "pearson"}
    for method in ("pearson", "spearman", "kendall", "phi_k", "cramers")
}
ProfileReport(
    df,
    title='Tugas Customer Data Preprocessed',
    correlations=report_correlations,
)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
